#pragma OPENCL EXTENSION cl_arm_printf : enable
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable

__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

__kernel void matmul_int8_lbriobntnp(__global char * out_buf, __global char4 *lhs_buf,
                                     __read_only image2d_t rhs_tex, int4 hwkt) {
    int height = hwkt.x, width = hwkt.y, kslice = hwkt.z/4;
    int h = get_global_id(0);
    int w = get_global_id(1);

    // 203 ms
    int acc = 0;
    for(int i=0; i<kslice; i+=4) {
        acc += arm_dot(lhs_buf[h*kslice + i + 0], convert_char4(read_imagei(rhs_tex, smp_none, (int2)(i+0, w))));
        acc += arm_dot(lhs_buf[h*kslice + i + 1], convert_char4(read_imagei(rhs_tex, smp_none, (int2)(i+1, w))));
        acc += arm_dot(lhs_buf[h*kslice + i + 2], convert_char4(read_imagei(rhs_tex, smp_none, (int2)(i+2, w))));
        acc += arm_dot(lhs_buf[h*kslice + i + 3], convert_char4(read_imagei(rhs_tex, smp_none, (int2)(i+3, w))));
    }
    out_buf[h*width + w] = convert_char(acc % 53);

    // int4 acc = 0;
    // for(int i=0; i<kslice; i+=4) {
    //     acc = mad24(convert_int4(lhs_buf[h*kslice + i + 0]), convert_int4(read_imagei(rhs_tex, smp_none, (int2)(i+0, w))), acc);
    //     acc = mad24(convert_int4(lhs_buf[h*kslice + i + 1]), convert_int4(read_imagei(rhs_tex, smp_none, (int2)(i+1, w))), acc);
    //     acc = mad24(convert_int4(lhs_buf[h*kslice + i + 2]), convert_int4(read_imagei(rhs_tex, smp_none, (int2)(i+2, w))), acc);
    //     acc = mad24(convert_int4(lhs_buf[h*kslice + i + 3]), convert_int4(read_imagei(rhs_tex, smp_none, (int2)(i+3, w))), acc);
    // }
    // out_buf[h*width + w] = convert_char((acc.x+acc.y+acc.z+acc.w) % 53);

    // float acc = 0;
    // for(int i=0; i<kslice; i+=4) {
    //     acc += dot(convert_float4(lhs_buf[h*kslice + i + 0]), convert_float4(read_imagei(rhs_tex, smp_none, (int2)(i+0, w))));
    //     acc += dot(convert_float4(lhs_buf[h*kslice + i + 1]), convert_float4(read_imagei(rhs_tex, smp_none, (int2)(i+1, w))));
    //     acc += dot(convert_float4(lhs_buf[h*kslice + i + 2]), convert_float4(read_imagei(rhs_tex, smp_none, (int2)(i+2, w))));
    //     acc += dot(convert_float4(lhs_buf[h*kslice + i + 3]), convert_float4(read_imagei(rhs_tex, smp_none, (int2)(i+3, w))));
    // }
    // out_buf[h*width + w] = convert_char((int)(acc) % 53);
}